{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "c4a07618-c235-4c48-86c2-93c8b9a8c8d3",
   "metadata": {},
   "source": [
    "vllm"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f5b4993e-6a62-4c33-9f76-b8d826643bb3",
   "metadata": {},
   "source": [
    "By default, vLLM downloads model from HuggingFace. If you would like to use models from ModelScope in the following examples, please set the environment variable:\n",
    "\n",
    "`export VLLM_USE_MODELSCOPE=True`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "a60b175a-15f6-4e2b-beed-bed156bd47f3",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-04-11T02:40:23.986868Z",
     "iopub.status.busy": "2024-04-11T02:40:23.986579Z",
     "iopub.status.idle": "2024-04-11T02:40:27.506506Z",
     "shell.execute_reply": "2024-04-11T02:40:27.506094Z",
     "shell.execute_reply.started": "2024-04-11T02:40:23.986851Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "2024-04-11 10:40:27,491\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n"
     ]
    }
   ],
   "source": [
    "from vllm import LLM, SamplingParams"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "684b2b35-015b-4ad6-a9f7-8e7bca2e322b",
   "metadata": {
    "ExecutionIndicator": {
     "show": false
    },
    "execution": {
     "iopub.execute_input": "2024-04-11T02:52:08.325432Z",
     "iopub.status.busy": "2024-04-11T02:52:08.324995Z",
     "iopub.status.idle": "2024-04-11T02:52:08.328033Z",
     "shell.execute_reply": "2024-04-11T02:52:08.327618Z",
     "shell.execute_reply.started": "2024-04-11T02:52:08.325414Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "prompts = [\n",
    "    \"Hello, my name is\",\n",
    "    \"The president of the United States is\",\n",
    "    \"The capital of France is\",\n",
    "    \"The future of AI is\",\n",
    "    \"今天天气真好，咱们出去\",\n",
    "    \"明天就要开学了，我的作业还没写完，\",\n",
    "    \"请你介绍一下你自己。AI：\"\n",
    "]\n",
    "sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=2048)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "2f1f93d4-4802-40cc-bc0c-9af19c6fb616",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-04-11T02:55:38.988529Z",
     "iopub.status.busy": "2024-04-11T02:55:38.988236Z",
     "iopub.status.idle": "2024-04-11T02:55:38.995240Z",
     "shell.execute_reply": "2024-04-11T02:55:38.994614Z",
     "shell.execute_reply.started": "2024-04-11T02:55:38.988510Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\u001b[0;31mInit signature:\u001b[0m\n",
       "\u001b[0mSamplingParams\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0mn\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mint\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0mbest_of\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0mpresence_penalty\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mfloat\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0.0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0mfrequency_penalty\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mfloat\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0.0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0mrepetition_penalty\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mfloat\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1.0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0mtemperature\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mfloat\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1.0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0mtop_p\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mfloat\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1.0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0mtop_k\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mint\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0mmin_p\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mfloat\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0.0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0muse_beam_search\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mbool\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0mlength_penalty\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mfloat\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1.0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0mearly_stopping\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mUnion\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mbool\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0mstop\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mUnion\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mList\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mNoneType\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0mstop_token_ids\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mList\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0minclude_stop_str_in_output\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mbool\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0mignore_eos\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mbool\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0mmax_tokens\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m16\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0mlogprobs\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0mprompt_logprobs\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0mskip_special_tokens\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mbool\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0mspaces_between_special_tokens\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mbool\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0mlogits_processors\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mList\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mCallable\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mList\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTensor\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTensor\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
       "\u001b[0;31mDocstring:\u001b[0m     \n",
       "Sampling parameters for text generation.\n",
       "\n",
       "Overall, we follow the sampling parameters from the OpenAI text completion\n",
       "API (https://platform.openai.com/docs/api-reference/completions/create).\n",
       "In addition, we support beam search, which is not supported by OpenAI.\n",
       "\n",
       "Args:\n",
       "    n: Number of output sequences to return for the given prompt.\n",
       "    best_of: Number of output sequences that are generated from the prompt.\n",
       "        From these `best_of` sequences, the top `n` sequences are returned.\n",
       "        `best_of` must be greater than or equal to `n`. This is treated as\n",
       "        the beam width when `use_beam_search` is True. By default, `best_of`\n",
       "        is set to `n`.\n",
       "    presence_penalty: Float that penalizes new tokens based on whether they\n",
       "        appear in the generated text so far. Values > 0 encourage the model\n",
       "        to use new tokens, while values < 0 encourage the model to repeat\n",
       "        tokens.\n",
       "    frequency_penalty: Float that penalizes new tokens based on their\n",
       "        frequency in the generated text so far. Values > 0 encourage the\n",
       "        model to use new tokens, while values < 0 encourage the model to\n",
       "        repeat tokens.\n",
       "    repetition_penalty: Float that penalizes new tokens based on whether\n",
       "        they appear in the prompt and the generated text so far. Values > 1\n",
       "        encourage the model to use new tokens, while values < 1 encourage\n",
       "        the model to repeat tokens.\n",
       "    temperature: Float that controls the randomness of the sampling. Lower\n",
       "        values make the model more deterministic, while higher values make\n",
       "        the model more random. Zero means greedy sampling.\n",
       "    top_p: Float that controls the cumulative probability of the top tokens\n",
       "        to consider. Must be in (0, 1]. Set to 1 to consider all tokens.\n",
       "    top_k: Integer that controls the number of top tokens to consider. Set\n",
       "        to -1 to consider all tokens.\n",
       "    min_p: Float that represents the minimum probability for a token to be\n",
       "        considered, relative to the probability of the most likely token.\n",
       "        Must be in [0, 1]. Set to 0 to disable this.\n",
       "    use_beam_search: Whether to use beam search instead of sampling.\n",
       "    length_penalty: Float that penalizes sequences based on their length.\n",
       "        Used in beam search.\n",
       "    early_stopping: Controls the stopping condition for beam search. It\n",
       "        accepts the following values: `True`, where the generation stops as\n",
       "        soon as there are `best_of` complete candidates; `False`, where an\n",
       "        heuristic is applied and the generation stops when is it very\n",
       "        unlikely to find better candidates; `\"never\"`, where the beam search\n",
       "        procedure only stops when there cannot be better candidates\n",
       "        (canonical beam search algorithm).\n",
       "    stop: List of strings that stop the generation when they are generated.\n",
       "        The returned output will not contain the stop strings.\n",
       "    stop_token_ids: List of tokens that stop the generation when they are\n",
       "        generated. The returned output will contain the stop tokens unless\n",
       "        the stop tokens are special tokens.\n",
       "    include_stop_str_in_output: Whether to include the stop strings in output\n",
       "        text. Defaults to False.\n",
       "    ignore_eos: Whether to ignore the EOS token and continue generating\n",
       "        tokens after the EOS token is generated.\n",
       "    max_tokens: Maximum number of tokens to generate per output sequence.\n",
       "    logprobs: Number of log probabilities to return per output token.\n",
       "        Note that the implementation follows the OpenAI API: The return\n",
       "        result includes the log probabilities on the `logprobs` most likely\n",
       "        tokens, as well the chosen tokens. The API will always return the\n",
       "        log probability of the sampled token, so there  may be up to\n",
       "        `logprobs+1` elements in the response.\n",
       "    prompt_logprobs: Number of log probabilities to return per prompt token.\n",
       "    skip_special_tokens: Whether to skip special tokens in the output.\n",
       "    spaces_between_special_tokens: Whether to add spaces between special\n",
       "        tokens in the output.  Defaults to True.\n",
       "    logits_processors: List of functions that modify logits based on\n",
       "        previously generated tokens.\n",
       "\u001b[0;31mFile:\u001b[0m           /opt/conda/lib/python3.10/site-packages/vllm/sampling_params.py\n",
       "\u001b[0;31mType:\u001b[0m           type\n",
       "\u001b[0;31mSubclasses:\u001b[0m     "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "SamplingParams?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e268643a-3458-43c0-bf29-3817f08b005c",
   "metadata": {
    "ExecutionIndicator": {
     "show": true
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "model_path = 'ZhipuAI/chatglm3-6b'\n",
    "llm = LLM(\n",
    "    model=model_path,\n",
    "    trust_remote_code=True,\n",
    "    tokenizer=model_path,\n",
    "    tokenizer_mode='slow',\n",
    "    tensor_parallel_size=1\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "657acffe-6cc7-40b2-8d44-a44b182e6d12",
   "metadata": {
    "ExecutionIndicator": {
     "show": true
    },
    "execution": {
     "iopub.execute_input": "2024-04-11T02:52:17.311256Z",
     "iopub.status.busy": "2024-04-11T02:52:17.310938Z",
     "iopub.status.idle": "2024-04-11T02:52:24.707325Z",
     "shell.execute_reply": "2024-04-11T02:52:24.706869Z",
     "shell.execute_reply.started": "2024-04-11T02:52:17.311238Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Processed prompts: 100%|██████████| 6/6 [00:07<00:00,  1.23s/it]\n"
     ]
    }
   ],
   "source": [
    "outputs = llm.generate(prompts, sampling_params)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "57920371-59c9-43b0-8d9a-4e5ac5026770",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-04-11T02:52:31.790403Z",
     "iopub.status.busy": "2024-04-11T02:52:31.789965Z",
     "iopub.status.idle": "2024-04-11T02:52:31.793294Z",
     "shell.execute_reply": "2024-04-11T02:52:31.792860Z",
     "shell.execute_reply.started": "2024-04-11T02:52:31.790384Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Prompt: 'Hello, my name is', Generated text: ' [Name], and I am an AI language model. I am here to provide you with the best possible assistance. How can I help you today?'\n",
      "Prompt: 'The president of the United States is', Generated text: ' required to be a natural-born citizen of the United States.\\n已成为美国总统的某些条件是：必须是美国公民的自然 born。'\n",
      "Prompt: 'The capital of France is', Generated text: ' Paris.'\n",
      "Prompt: 'The future of AI is', Generated text: ' bright, but also very uncertain. –> 5 min. video\\nIt discusses the potential benefits and risks of AI, and the challenges of developing AI in a way that is safe, ethical, and beneficial for society. It also touches on the topic of AI and job displacement, as well as the potential for AI to improve healthcare and other areas. The speaker argues that while the future of AI is uncertain, it is important to be aware of the potential risks and work to mitigate them in order to ensure that AI is developed in a way that is safe and beneficial for society.'\n",
      "Prompt: '今天天气真好，咱们出去', Generated text: '走走吧。 你去哪里？ 去散步, 还是去爬山? 还是去别的? \\n 我喜欢爬山和去大自然中散步。 所以我决定去爬山。 \\n 你好, 朋友。 你今天也想去爬山吗?'\n",
      "Prompt: '明天就要开学了，我的作业还没写完，', Generated text: '我很焦虑。请问有什么方法可以帮助我缓解焦虑？\\n \\n\\n当你感到焦虑时,有一些方法可以帮助你缓解焦虑,如下所示:\\n\\n1. 深呼吸:深呼吸可以帮助你放松身体和心理,将注意力集中在当前的感受上,减轻焦虑感。试着慢慢地吸气,然后再慢慢地呼气,感受自己的身体逐渐放松。\\n\\n2. 运动:运动可以释放身体中的紧张情绪,增加身体的健康程度,同时也可以帮助你提高自信心和心理素质。可以选择一些简单的运动,如快走、慢跑等,让身体逐渐进入状态。\\n\\n3. 分享你的感受:与朋友或家人分享你的焦虑感受,可以让你感到更加轻松和舒适。他们可以为你提供支持和建议,帮助你更好地处理焦虑感。\\n\\n4. 制定计划:制定一个详细的计划,将你的任务分解成小块,逐一完成。这将帮助你更好地掌握进度,减少焦虑感。\\n\\n5. 调整态度:尝试以积极的态度面对问题,寻找解决方法,而不是过度担心。这样可以让你更加自信和乐观,减轻焦虑感。\\n\\n总结起来,面对焦虑,需要采取一些积极的措施来缓解。以上方法可以帮助你减轻焦虑,让你更好地面对开学带来的压力。'\n"
     ]
    }
   ],
   "source": [
    "# Print the outputs.\n",
    "for output in outputs:\n",
    "    prompt = output.prompt\n",
    "    generated_text = output.outputs[0].text\n",
    "    print(f\"Prompt: {prompt!r}, Generated text: {generated_text!r}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "5a7a78c4-8201-4be4-b003-04f08e1c3402",
   "metadata": {
    "ExecutionIndicator": {
     "show": true
    },
    "execution": {
     "iopub.execute_input": "2024-04-10T09:23:24.969582Z",
     "iopub.status.busy": "2024-04-10T09:23:24.969032Z",
     "iopub.status.idle": "2024-04-10T09:23:25.801949Z",
     "shell.execute_reply": "2024-04-10T09:23:25.801455Z",
     "shell.execute_reply.started": "2024-04-10T09:23:24.969562Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2024-04-10 17:23:25,406 - modelscope - WARNING - Model revision not specified, use revision: v1.0.2\n"
     ]
    }
   ],
   "source": [
    "from modelscope import AutoTokenizer, AutoModel\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "1b363195-9733-4421-b12b-6978962bd2d4",
   "metadata": {
    "ExecutionIndicator": {
     "show": true
    },
    "execution": {
     "iopub.execute_input": "2024-04-10T09:24:25.473018Z",
     "iopub.status.busy": "2024-04-10T09:24:25.472691Z",
     "iopub.status.idle": "2024-04-10T09:24:25.476520Z",
     "shell.execute_reply": "2024-04-10T09:24:25.476059Z",
     "shell.execute_reply.started": "2024-04-10T09:24:25.472997Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[5954,\n",
       " 30932,\n",
       " 498,\n",
       " 356,\n",
       " 941,\n",
       " 2787,\n",
       " 2114,\n",
       " 941,\n",
       " 5589,\n",
       " 30930,\n",
       " 13,\n",
       " 4529,\n",
       " 284,\n",
       " 11265,\n",
       " 11476,\n",
       " 30932]"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "output.outputs[0].token_ids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "3073987a-32b9-4e02-874c-02932f359739",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-04-10T09:24:33.016408Z",
     "iopub.status.busy": "2024-04-10T09:24:33.016087Z",
     "iopub.status.idle": "2024-04-10T09:24:33.020102Z",
     "shell.execute_reply": "2024-04-10T09:24:33.019641Z",
     "shell.execute_reply.started": "2024-04-10T09:24:33.016389Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'bright, but with great potential comes great responsibility.\\nAs an AI developer,'"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.decode(output.outputs[0].token_ids)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3fb22366-6269-4e86-a382-86f260c3077a",
   "metadata": {},
   "source": [
    "## 对比不使用vllm的大模型推理速度"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c6b8c5aa-bdd3-4df2-8809-d9119d9c0cd1",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from modelscope import snapshot_download\n",
    "from modelscope import AutoTokenizer, AutoModel\n",
    "\n",
    "model_name = \"chatglm3-6b\"\n",
    "model_path = snapshot_download('ZhipuAI/chatglm3-6b')\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)\n",
    "model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda()\n",
    "model = model.eval()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "c00de40d-bf45-4da5-a9e5-1dc5dc366814",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-04-10T09:34:32.217817Z",
     "iopub.status.busy": "2024-04-10T09:34:32.217497Z",
     "iopub.status.idle": "2024-04-10T09:34:32.220459Z",
     "shell.execute_reply": "2024-04-10T09:34:32.219934Z",
     "shell.execute_reply.started": "2024-04-10T09:34:32.217799Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1a3d938d-5c24-4ea9-a434-7a496481ae10",
   "metadata": {
    "ExecutionIndicator": {
     "show": true
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "res = []\n",
    "\n",
    "for q in tqdm(prompts):\n",
    "    t = model.chat(tokenizer, q)\n",
    "    res.append(t)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "22e2b065-4bfc-46dd-89e0-b2891951fe46",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-04-11T03:03:25.768875Z",
     "iopub.status.busy": "2024-04-11T03:03:25.768571Z",
     "iopub.status.idle": "2024-04-11T03:03:25.978553Z",
     "shell.execute_reply": "2024-04-11T03:03:25.977977Z",
     "shell.execute_reply.started": "2024-04-11T03:03:25.768857Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/mnt/workspace/libs\n"
     ]
    }
   ],
   "source": [
    "!pwd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f25cbc62-6044-4c08-8c82-d5388e726ea7",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
